The data used for this project was provided by TATA through theforage.com virtual experience platform. This is an analysis of a retail store data to give actionable insights that would assist business decision-making.
The Data
InvoiceNO: invoice code for purchase made
StockCode: the code of the object purchased when in stock
Description: the description of item purchased
Quantity: the amount of the product purchased
InvoiceDate: the year,month, date, hour, minute and seconds of item purchase
Unitprice: the cost for one of the product
CustomerID: a number representing and tagging a particular customer
Country: the country where purchase was made.
# Importing libraries
library(tidyverse)
library(readxl)
library(rnaturalearth)
library(gridExtra)
library(plotly)Next, I imported the data which is in xlsx format. The file is fairly large and can be downloaded at ‘https://github.com/xrander/online_retail_store_project/blob/master/Online%20Retail.xlsx’.
#import data
online_store <- read_csv("https://raw.githubusercontent.com/xrander/online_retail_store_project/master/online_store.csv")## Rows: 541909 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): InvoiceNo, StockCode, Description, Country
## dbl (3): Quantity, UnitPrice, CustomerID
## dttm (1): InvoiceDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Data types do change after converting ‘xlsx’ to ‘csv’, it is therefore necessary to investigate the data before attempting any questions. This can also provide some insight to the data which aids analysis
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## InvoiceDate UnitPrice CustomerID
## Min. :2010-12-01 08:26:00.00 Min. :-11062.06 Min. :12346
## 1st Qu.:2011-03-28 11:34:00.00 1st Qu.: 1.25 1st Qu.:13953
## Median :2011-07-19 17:17:00.00 Median : 2.08 Median :15152
## Mean :2011-07-04 13:34:57.16 Mean : 4.61 Mean :15288
## 3rd Qu.:2011-10-19 11:27:00.00 3rd Qu.: 4.13 3rd Qu.:16791
## Max. :2011-12-09 12:50:00.00 Max. : 38970.00 Max. :18287
## NA's :135080
## Country
## Length:541909
## Class :character
## Mode :character
##
##
##
##
## spc_tbl_ [541,909 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ InvoiceNo : chr [1:541909] "536365" "536365" "536365" "536365" ...
## $ StockCode : chr [1:541909] "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr [1:541909] "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : num [1:541909] 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: POSIXct[1:541909], format: "2010-12-01 08:26:00" "2010-12-01 08:26:00" ...
## $ UnitPrice : num [1:541909] 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num [1:541909] 17850 17850 17850 17850 17850 ...
## $ Country : chr [1:541909] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## - attr(*, "spec")=
## .. cols(
## .. InvoiceNo = col_character(),
## .. StockCode = col_character(),
## .. Description = col_character(),
## .. Quantity = col_double(),
## .. InvoiceDate = col_datetime(format = ""),
## .. UnitPrice = col_double(),
## .. CustomerID = col_double(),
## .. Country = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
online_store$CustomerID <- as.factor(online_store$CustomerID)# changing data types
# The POSIXct data type for InvoiceDate have been changed to character when file was converted.
online_store <- online_store %>%
mutate(InvoiceDate = as.POSIXct(InvoiceDate, format = "%Y-%m-%d %H:%M:%S")) # Changing character type to Posixct# We need to estimate the revenue generated first before estimating revenue per region
online_store <- online_store %>%
mutate(Revenue = UnitPrice * Quantity)
regional_sales <- online_store %>%
select(Country, Revenue, CustomerID) %>%
filter(Country != 'United Kingdom') %>%
group_by(Country) %>%
summarize(total_revenue = sum(Revenue),
average_revenue = mean(Revenue) #Estimates of the revenue per region
)The United Kingdom is having more than twice the revenue of the second place country and is excluded from the analysis of this question moving forward
# Estimating the 10 highest earning regions
top_10 <- regional_sales %>% top_n(10, wt = total_revenue)
top_countries <- ggplot(top_10,
aes(reorder(Country, total_revenue),
total_revenue/10000))+
geom_bar(aes(fill = Country),
stat = 'identity')+
labs(title = 'Top Revenue Generating regions Excluding the UK',
x = 'Country',
y = 'Revenue generated in tens of thousands')+
theme(plot.title = element_text(face = 'bold'),
axis.title.x = element_text(face = 'bold'),
axis.title.y = element_text(face = 'bold'))
# Estimating the 10 lowest revenue generating regions
bottom_10 <- regional_sales %>% top_n(-10, total_revenue)
least_countries <- ggplot(bottom_10,
aes(reorder(Country, total_revenue),
total_revenue))+
geom_bar(aes(fill = Country),
stat = 'identity')+
labs(title = 'Least Revenue Generating regions',
x = 'Country',
y ='Revenue generated') +
theme(plot.title = element_text(face = 'bold'),
axis.title.x = element_text(face = 'bold'),
axis.title.y = element_text(face = 'bold'))
lc <- ggplotly(least_countries)
tc <- ggplotly(top_countries)
lc# For this analysis we need to extract the months and date from InvoiceDate
online_store <- online_store %>%
mutate(month_num = as.integer(format(InvoiceDate, format ='%m')),
year = factor(format(InvoiceDate, format = '%Y'),
levels = c(2010, 2011)),
month = factor(month.abb[month_num],
levels = c('Jan', 'Feb', 'Mar',
'Apr', 'May', 'Jun',
'Jul', 'Aug', 'Sep',
'Oct', 'Nov', 'Dec')))
monthly_revenue <- online_store %>%
select(month, year, Revenue) %>%
group_by(year, month) %>%
summarize(total_revenue = sum(Revenue),
average_revenue = mean(Revenue),) %>%
mutate(percent_change = (total_revenue - lag(total_revenue))/lag(total_revenue) * 100,
month_num = as.integer(month))## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
mnth_chn <- ggplot(monthly_revenue, aes(month, percent_change))+
geom_bar(aes(fill = month),
na.rm = T,
stat = 'identity')
ggplotly(mnth_chn)## Warning: Removed 2 rows containing missing values (`position_stack()`).
mnth_in <- ggplot(monthly_revenue, aes(month, total_revenue)) +
geom_bar(aes(fill = year),stat = 'identity') +
scale_fill_manual(values = c('lightgreen', 'darkgreen'))
ggplotly(mnth_in)cp <- online_store %>%
select(CustomerID, Revenue) %>%
filter(!is.na(CustomerID)) %>%
group_by(CustomerID) %>%
summarize(number_of_purchase = length(CustomerID),
total_purchase = sum(Revenue)) # This returns all the revenue generated from each customers and the number of times they made a purchase
# To get the top 10 customers that spent the most
top_10_cp <- cp %>% top_n(10, wt = total_purchase)
tp_10_customer<- top_10_cp %>%
ggplot(aes(reorder(CustomerID, total_purchase/10000), y = total_purchase/10000, fill = CustomerID)) +
geom_bar(stat = 'identity')+
labs(title = 'Top spending customers',
x = 'Customer ID',
y = 'Amount spent in ten thousands')+
theme(plot.title = element_text(face = 'bold'),
axis.title.x = element_text(face = 'bold'),
axis.title.y = element_text(face = 'bold'))
ggplotly(tp_10_customer)The identity of the top customers
## [1] 12415 13694 14156 14646 14911 15311 16684 17450 17511 18102
## 4372 Levels: 12346 12347 12348 12349 12350 12352 12353 12354 12355 ... 18287
The contribution of the top customers to the total revenue
# Proportion of top customers in the total purchase
sum(top_10_cp$total_purchase)/sum(online_store$Revenue) * 100## [1] 14.04613
14% of the total revenue is by the top customers and this implies that the business is not diversified as losing this customers which is less than 0.0022873 will lead to a 14% drop in revenue. Ideally, we would want to have a larger number of customers contributing smaller amounts to our revenue, so that we are not overly reliant on any one customer or group of customers.
# Amount generated from customers with more than one order
online_store %>% select(CustomerID, Revenue) %>%
filter(length(CustomerID)>1) %>%
summarize(revenue = sum(Revenue))